For our project we decided to analyze the “airlines customer satisfaction” dataset available on Kaggle. This dataset contains information on passengers who have already flown with the airline that provided the data. The variables cover general characteristics of the passenger (age, gender, etc.) and of the flight (distance, class, etc.), the customers’ feedback on different aspects of their experience, as well as their overall level of satisfaction.
As the goal of our work, we tried to extract insights that might be of interest to the company that owns the data, in order to improve its service and strategy. With this idea, we structured the analysis in this way: 1) Exploratory data analysis, to understand and visualize the data provided; 2) Customer segmentation, in order to identify the customers' different characteristics and enable the company to better target its efforts; 3) Creation of a model to predict whether or not a passenger will be satisfied, given the variables examined.
Outline of the analysis:
Data upload
Exploratory analysis:
Unsupervised Learning
Logistic regression:
Linear Discriminant Analysis:
k-NN
Tree predictors:
# Fix the RNG seed so the train/test split and k-means restarts are reproducible
set.seed(123)
library(tidyverse)
library(dplyr)
library(ggplot2)
library(grid)
library(gridExtra)
library(corrr)
library(ggradar)
library(scales)
library(tibble)
library(caret)
library(factoextra)
# Load the Kaggle airline passenger satisfaction dataset as a tibble
df <- read_csv("dataset/airline_passenger_satisfaction.csv")
Give more compact names to columns:
# Shorten the column names so the rest of the analysis is easier to read
df <- df %>% rename(gender = Gender, cust_type = customer_type,
trav_type = type_of_travel,
class = customer_class,
distance = flight_distance,
wifi = inflight_wifi_service,
time_conv = departure_arrival_time_convenient,
onl_book = ease_of_online_booking,
gate_loc = gate_location,
food = food_and_drink,
onl_board = online_boarding,
comfort = seat_comfort,
entert = inflight_entertainment,
onb_serv = onboard_service,
leg_room = leg_room_service,
baggage = baggage_handling,
checkin = checkin_service,
infl_serv = inflight_service,
clean = cleanliness,
dep_delay = departure_delay_in_minutes,
arr_delay = arrival_delay_in_minutes,
sat = satisfaction)
The zero rating corresponds to a missing answer for that variable. For this reason we decided to remove the rows where a zero appears.
# A rating of 0 encodes "no answer"; keep only the rows where none of the
# 14 survey-rating columns (positions 8:21 at this point) contains a 0.
# rowSums() on the logical matrix is vectorized and avoids apply() on a
# data frame, which coerces it to a matrix and scans it row by row.
rating_cols <- 8:21
df_filtered <- df[rowSums(df[, rating_cols] == 0) == 0, ]
df <- df_filtered
Remove the index column
# Drop the first column (row index carried over from the csv)
df <- df[-1]
# Count NA values for each column.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# named numeric vector regardless of input shape.
na_counts <- vapply(df, function(x) sum(is.na(x)), numeric(1))
# Calculate the percentage of NA values for each column
na_percentage <- na_counts / nrow(df) * 100
# Display a table with NA counts and percentages per column
na_table <- data.frame(Column = names(na_counts), NA_Count = na_counts,
                       Percentage = na_percentage, row.names = NULL)
na_table
Missing values are present only for the column arrival delay in minutes, for just a little more than 0.3% of the observations. We check whether, as common sense might suggest, there is a correlation between departure delay and arrival delay (both measured in minutes).
# Pearson correlation between departure and arrival delay,
# ignoring the rows where arrival delay is missing
correlation_dep_arr <- cor(df$dep_delay, df$arr_delay, use = "complete.obs")
correlation_dep_arr
## [1] 0.9650396
Given the high correlation we decided to drop the arr_delay column and remove in this way the NA values.
# arr_delay is almost perfectly correlated with dep_delay (~0.965), so
# dropping it loses little information and removes every remaining NA
df <- df %>% select(- arr_delay)
The majority of variables take values on a fixed scale. The variables worth checking for outliers are: departure delay in minutes and flight distance.
# Scatter of departure delays against row index to eyeball extreme values
plot(df$dep_delay, main = "Departure Delay",
xlab = "Index", ylab = "Departure delay in minutes", col = "darkgreen")
# Flag observations more than 3 standard deviations from the mean
z_scores_dep <- scale(df$dep_delay)
outliers_dep <- which(abs(z_scores_dep) > 3)
sd_outliers_count_dep <- length(outliers_dep)
cat("Observations that falls over 3 times the sd (departure delay):", sd_outliers_count_dep, "\n")
## Observations that falls over 3 times the sd (departure delay): 2525
cat("Percentage of outliers identified (departure delay):",
round(sd_outliers_count_dep / length(df$dep_delay), 4) * 100, "%", "\n")
## Percentage of outliers identified (departure delay): 2.11 %
Over 2% of the observations are identified as outliers using the standard deviation method.
# Same 3-standard-deviation outlier check for flight distance
plot(df$distance, main = "Flight Distance",
xlab = "Index", ylab = "Flight distance", col = "darkblue")
z_scores_dist <- scale(df$distance)
outliers_dist <- which(abs(z_scores_dist) > 3)
sd_outliers_count_dist <- length(outliers_dist)
cat("Observations that falls over 3 times the sd (flight distance):", sd_outliers_count_dist, "\n")
## Observations that falls over 3 times the sd (flight distance): 73
cat("Percentage of outliers identified (flight distance):", round(sd_outliers_count_dist / length(df$distance), 4) * 100, "%", "\n")
## Percentage of outliers identified (flight distance): 0.06 %
We eventually decided to not remove any outliers.
# Bar chart of the overall satisfaction distribution, with percentage labels.
# after_stat(count) replaces the ..count.. notation, which is deprecated
# since ggplot2 3.4 (this file already uses the 3.4+ `linewidth` aesthetic).
p_satisf <- ggplot(df, aes(x = sat)) +
  geom_bar(aes(fill = sat)) +
  geom_text(aes(y = after_stat(count),
                label = paste0(round(prop.table(after_stat(count)), 4) * 100, '%')),
            stat = 'count',
            position = position_dodge(0.3),
            size = 4,
            vjust = - 0.5) +
  scale_fill_manual(values = c("red", "green")) +
  labs(title = "Satisfaction Distribution", y = "Count")
p_satisf
Define a function to plot bar charts
# Plot bar chart function
# Draws a satisfaction-coloured bar chart of `var_name` (an unquoted column
# of `df`) with percentage labels above each bar, and prints it.
# after_stat(count) replaces ..count.., deprecated since ggplot2 3.4.
plot_bar_chart <- function(df, var_name) {
  var_name <- enquo(var_name) # Defuse the var_name so it can be evaluated later
  plot_var <- ggplot(df, aes(x = !!var_name)) + # !! to inject the var_name back to evaluate it
    geom_bar(aes(fill = sat)) +
    geom_text(aes(y = after_stat(count),
                  label = paste0(round(prop.table(after_stat(count)), 4) * 100, '%')),
              stat = 'count',
              position = position_dodge(0.3),
              size = 4,
              vjust = -0.5) +
    scale_fill_manual(values = c("red", "green")) +
    labs(title = paste0("Satisfaction Distribution for ", quo_name(var_name)),
         x = quo_name(var_name), y = "Count") # quo_name() to return the var_name as string
  print(plot_var)
}
# Satisfaction bar charts for the demographic / trip variables
# Gender
plot_bar_chart(df, gender)
# Age
# Bin age into decades so the bar chart stays readable
df$age_group <- cut(df$age, breaks = seq(0, max(df$age) + 10, by = 10), right = FALSE)
plot_bar_chart(df, age_group)
# Customer type
plot_bar_chart(df, cust_type)
# Type of travel
plot_bar_chart(df, trav_type)
# Customer class
plot_bar_chart(df, class)
Dropping age_group because it is not needed anymore (we will only use age).
# The decade bins were only needed for the bar chart above
df <- select(df, -age_group)
# Satisfaction bar charts for each service-rating variable
# Inflight Wifi service
plot_bar_chart(df, wifi)
# Departure/arrival time convenient
plot_bar_chart(df, time_conv)
# Ease of online booking
plot_bar_chart(df, onl_book)
# Gate location
plot_bar_chart(df, gate_loc)
# Food and drink
plot_bar_chart(df, food)
# Online boarding
plot_bar_chart(df, onl_board)
# Seat comfort
plot_bar_chart(df, comfort)
# Inflight entertainment
plot_bar_chart(df, entert)
# Onboard service
plot_bar_chart(df, onb_serv)
# Leg room service
plot_bar_chart(df, leg_room)
# Baggage handling
plot_bar_chart(df, baggage)
# Check-in service
plot_bar_chart(df, checkin)
# Inflight service
plot_bar_chart(df, infl_serv)
# Cleanliness
plot_bar_chart(df, clean)
# Flight distance
# Frequency polygon of flight distance, split by satisfaction
p_distance <- ggplot(data = df, aes(distance, color = sat)) +
geom_freqpoly(binwidth = 100, linewidth = 1) +
scale_color_manual(values = c("red", "green")) +
labs(title = "Satisfaction distribution for Flight distance")
p_distance
# Departure delay
# We restrict to delays under 200 min; otherwise the long right tail makes the graph unreadable
p_dep_del <- ggplot(data = df[df$dep_delay < 200, ],
aes(dep_delay, color = sat)) +
geom_freqpoly(binwidth = 15, linewidth = 0.7) +
scale_color_manual(values = c("red", "green")) +
labs(title = "Satisfaction distribution for Departure delay")
p_dep_del
To display the correlation matrix we first encoded the variables with one-hot encoding.
# Binary / one-hot encoding of the categorical variables so a numeric
# correlation matrix can be computed.
df$class_Eco <- ifelse(df$class == "Eco", 1, 0)
df$class_Eco_plus <- ifelse(df$class == "Eco Plus", 1, 0)
df$sat <- ifelse(df$sat == "satisfied", 1, 0)
df$gender <- ifelse(df$gender == "Male", 1, 0)
df$cust_type <- ifelse(df$cust_type == "Loyal Customer", 1, 0)
df$trav_type <- ifelse(df$trav_type == "Personal Travel", 1, 0)
# The class column is fully captured by the two dummies above (Business is
# the reference level), so drop it. The original also re-coded class to
# 0/1 right before dropping it — that dead statement has been removed.
df <- df %>% select(- class)
# Keep the class dummies next to the other passenger attributes
df <- df %>% relocate(class_Eco, .after = age)
df <- df %>% relocate(class_Eco_plus, .after = class_Eco)
df
# Correlation matrix over all (now numeric) variables
correlation_matrix <- df %>%
  cor()
round(correlation_matrix, 3)
## gender cust_type age class_Eco class_Eco_plus trav_type
## gender 1.000 0.031 0.007 -0.003 -0.011 0.009
## cust_type 0.031 1.000 0.253 -0.114 0.052 0.284
## age 0.007 0.253 1.000 -0.133 -0.014 -0.064
## class_Eco -0.003 -0.114 -0.133 1.000 -0.250 0.512
## class_Eco_plus -0.011 0.052 -0.014 -0.250 1.000 0.101
## trav_type 0.009 0.284 -0.064 0.512 0.101 1.000
## distance 0.005 0.205 0.083 -0.402 -0.132 -0.275
## wifi 0.005 0.018 0.022 -0.053 0.009 -0.126
## time_conv 0.007 0.103 -0.013 0.114 0.020 0.259
## onl_book 0.006 0.036 0.025 -0.107 -0.017 -0.131
## gate_loc 0.000 -0.011 -0.005 -0.005 -0.001 -0.033
## food 0.002 0.057 0.019 -0.083 -0.019 -0.072
## onl_board -0.042 0.189 0.189 -0.291 -0.075 -0.223
## comfort -0.030 0.152 0.154 -0.211 -0.061 -0.137
## entert 0.003 0.108 0.075 -0.189 -0.053 -0.166
## onb_serv 0.004 0.069 0.070 -0.195 -0.079 -0.065
## leg_room 0.023 0.057 0.055 -0.193 -0.065 -0.140
## baggage 0.036 -0.016 -0.036 -0.148 -0.069 -0.041
## checkin 0.009 0.043 0.043 -0.131 -0.066 0.018
## infl_serv 0.038 -0.014 -0.041 -0.142 -0.064 -0.029
## clean 0.003 0.079 0.047 -0.129 -0.037 -0.090
## dep_delay 0.003 -0.010 -0.012 0.011 0.003 -0.008
## sat 0.012 0.216 0.153 -0.453 -0.111 -0.463
## distance wifi time_conv onl_book gate_loc food onl_board
## gender 0.005 0.005 0.007 0.006 0.000 0.002 -0.042
## cust_type 0.205 0.018 0.103 0.036 -0.011 0.057 0.189
## age 0.083 0.022 -0.013 0.025 -0.005 0.019 0.189
## class_Eco -0.402 -0.053 0.114 -0.107 -0.005 -0.083 -0.291
## class_Eco_plus -0.132 0.009 0.020 -0.017 -0.001 -0.019 -0.075
## trav_type -0.275 -0.126 0.259 -0.131 -0.033 -0.072 -0.223
## distance 1.000 0.011 -0.073 0.057 0.004 0.057 0.199
## wifi 0.011 1.000 0.391 0.676 0.382 0.151 0.454
## time_conv -0.073 0.391 1.000 0.517 0.524 -0.009 0.059
## onl_book 0.057 0.676 0.517 1.000 0.529 0.029 0.351
## gate_loc 0.004 0.382 0.524 0.529 1.000 -0.003 -0.001
## food 0.057 0.151 -0.009 0.029 -0.003 1.000 0.259
## onl_board 0.199 0.454 0.059 0.351 -0.001 0.259 1.000
## comfort 0.157 0.147 -0.012 0.033 0.003 0.570 0.450
## entert 0.134 0.227 -0.033 0.036 0.002 0.615 0.316
## onb_serv 0.121 0.127 0.078 0.032 -0.029 0.063 0.177
## leg_room 0.138 0.159 -0.003 0.086 -0.005 0.037 0.142
## baggage 0.074 0.119 0.082 0.023 0.003 0.039 0.105
## checkin 0.081 0.058 0.117 0.017 -0.039 0.091 0.228
## infl_serv 0.069 0.108 0.083 0.022 0.001 0.040 0.095
## clean 0.096 0.156 0.002 0.018 -0.005 0.647 0.360
## dep_delay 0.001 -0.025 -0.006 -0.012 0.005 -0.024 -0.030
## sat 0.307 0.374 -0.058 0.228 0.001 0.228 0.570
## comfort entert onb_serv leg_room baggage checkin infl_serv
## gender -0.030 0.003 0.004 0.023 0.036 0.009 0.038
## cust_type 0.152 0.108 0.069 0.057 -0.016 0.043 -0.014
## age 0.154 0.075 0.070 0.055 -0.036 0.043 -0.041
## class_Eco -0.211 -0.189 -0.195 -0.193 -0.148 -0.131 -0.142
## class_Eco_plus -0.061 -0.053 -0.079 -0.065 -0.069 -0.066 -0.064
## trav_type -0.137 -0.166 -0.065 -0.140 -0.041 0.018 -0.029
## distance 0.157 0.134 0.121 0.138 0.074 0.081 0.069
## wifi 0.147 0.227 0.127 0.159 0.119 0.058 0.108
## time_conv -0.012 -0.033 0.078 -0.003 0.082 0.117 0.083
## onl_book 0.033 0.036 0.032 0.086 0.023 0.017 0.022
## gate_loc 0.003 0.002 -0.029 -0.005 0.003 -0.039 0.001
## food 0.570 0.615 0.063 0.037 0.039 0.091 0.040
## onl_board 0.450 0.316 0.177 0.142 0.105 0.228 0.095
## comfort 1.000 0.614 0.145 0.118 0.088 0.199 0.083
## entert 0.614 1.000 0.432 0.313 0.394 0.133 0.420
## onb_serv 0.145 0.432 1.000 0.370 0.532 0.254 0.560
## leg_room 0.118 0.313 0.370 1.000 0.379 0.163 0.379
## baggage 0.088 0.394 0.532 0.379 1.000 0.242 0.642
## checkin 0.199 0.133 0.254 0.163 0.242 1.000 0.246
## infl_serv 0.083 0.420 0.560 0.379 0.642 0.246 1.000
## clean 0.676 0.688 0.134 0.107 0.108 0.185 0.102
## dep_delay -0.030 -0.034 -0.034 0.009 -0.006 -0.018 -0.058
## sat 0.375 0.438 0.349 0.344 0.273 0.245 0.268
## clean dep_delay sat
## gender 0.003 0.003 0.012
## cust_type 0.079 -0.010 0.216
## age 0.047 -0.012 0.153
## class_Eco -0.129 0.011 -0.453
## class_Eco_plus -0.037 0.003 -0.111
## trav_type -0.090 -0.008 -0.463
## distance 0.096 0.001 0.307
## wifi 0.156 -0.025 0.374
## time_conv 0.002 -0.006 -0.058
## onl_book 0.018 -0.012 0.228
## gate_loc -0.005 0.005 0.001
## food 0.647 -0.024 0.228
## onl_board 0.360 -0.030 0.570
## comfort 0.676 -0.030 0.375
## entert 0.688 -0.034 0.438
## onb_serv 0.134 -0.034 0.349
## leg_room 0.107 0.009 0.344
## baggage 0.108 -0.006 0.273
## checkin 0.185 -0.018 0.245
## infl_serv 0.102 -0.058 0.268
## clean 1.000 -0.016 0.330
## dep_delay -0.016 1.000 -0.051
## sat 0.330 -0.051 1.000
library(corrplot)
# correlation_matrix already holds the correlations computed above;
# the original called cor() on it again, which correlates the columns of
# the correlation matrix itself — a meaningless quantity. corrplot()
# expects the correlation matrix directly.
corrplot(correlation_matrix)
The goal of this part was to use the k-means method to identify different clusters of passengers. Using the original data, however, we noticed that it is complicated to separate consumers into identifiable groups, so before proceeding with k-means, we transformed the data with PCA in order to reduce the number of dimensions, eliminate possible noise in the data, and generally make clustering operations easier.
Drop the response variable.
# Drop the response variable before the unsupervised analysis
df_uns <- df %>% select(-sat)
Perform PCA on the data after scaling the variables
# PCA on the standardized variables (scale = TRUE, since the columns are on
# very different scales, e.g. flight distance vs 1-5 ratings)
pca <- prcomp(df_uns, scale = TRUE)
summary(pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.039 1.5771 1.47750 1.33551 1.19443 1.07151 1.01024
## Proportion of Variance 0.189 0.1131 0.09923 0.08107 0.06485 0.05219 0.04639
## Cumulative Proportion 0.189 0.3021 0.40130 0.48237 0.54722 0.59940 0.64579
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.99898 0.96944 0.9613 0.90037 0.82297 0.69589 0.68592
## Proportion of Variance 0.04536 0.04272 0.0420 0.03685 0.03079 0.02201 0.02139
## Cumulative Proportion 0.69116 0.73388 0.7759 0.81273 0.84351 0.86552 0.88691
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.65615 0.62944 0.59448 0.56763 0.5410 0.51971 0.49718
## Proportion of Variance 0.01957 0.01801 0.01606 0.01465 0.0133 0.01228 0.01124
## Cumulative Proportion 0.90648 0.92449 0.94055 0.95520 0.9685 0.98078 0.99201
## PC22
## Standard deviation 0.41916
## Proportion of Variance 0.00799
## Cumulative Proportion 1.00000
# Scree plot of the eigenvalues; the dashed red line marks the
# Kaiser criterion (eigenvalue = 1)
fviz_eig(pca,
addlabels = TRUE,
choice="eigenvalue",
main="Figure 2",
ncp = 22) +
geom_hline(yintercept=1,
linetype="dashed",
color = "red") +
ggtitle("Eigenvalue of each component")
If we only had to keep the components with an eigenvalue bigger than 1, we could select the first 8 ones.
But this information is not enough, because with 8 components we are able to explain only 70% of the total variance.
# Percentage of variance explained by each principal component
pve <- 100 * pca$sdev^2 / sum(pca$sdev^2)
# Show the per-component and cumulative curves side by side
par(mfrow = c(1, 2))
plot(pve, type = "o", col = "blue",
     xlab = "Principal Component", ylab = "PVE")
plot(cumsum(pve), type = "o", col = "brown3",
     xlab = "Principal Component", ylab = "Cumulative PVE")
Considering this trade-off, we chose to keep the first eleven components.
The next step is to determine the optimal number of clusters.
# Determine number of clusters
# Elbow method: within-group sum of squares (WSS) for k = 1..10.
# wss[1] is the total sum of squares (the k = 1 case).
wss <- (nrow(df_uns)-1)*sum(apply(df_uns,2,var))
# NOTE(review): the elbow is computed on the raw (unscaled) data, while the
# clustering below runs on the first 11 PCA scores — consider computing the
# WSS on pca$x[, 1:11] for consistency, and passing nstart > 1 to kmeans()
# so each k's result does not depend on a single random initialization.
for (i in 2:10) wss[i] <- sum(kmeans(df_uns,
centers=i)$withinss)
plot(1:10, wss, type="b", xlab="Number of Clusters",
ylab="Within groups sum of squares")
From the graph it seems that we can choose a number of clusters equal to 3.
Now that we have selected our hyperparameters, we can apply k-means on the first eleven components of the PCA.
# k-means with k = 3 and 20 random restarts on the first 11 PCA scores
km <- kmeans(pca$x[, 1:11], centers = 3, nstart = 20)
# Visualize the clusters on the first two principal components
plot(pca$x[, 1:2], col = alpha(km$cluster + 1, 0.5), pch = 20, cex = 0.2)
Now that we have grouped the passengers into the different groups, we can take a closer look at them.
Add the cluster column to the original df, then plot the satisfaction rate for different clusters:
# Attach each passenger's cluster label to the data frame.
# Use the full component name: the original `km$clust` relied on `$`
# partial matching of `km$cluster`, which is fragile, and assigned with
# `=` instead of `<-`. An unused intermediate (`clust`) was removed.
df$cluster <- km$cluster
# Satisfaction split within each cluster
ggplot(df, aes(x = cluster)) +
  geom_bar(aes(fill = as.factor(sat))) +
  scale_fill_manual(values = c("red", "green"),
                    labels = c("neutral or dissatisfied", "satisfied")) +
  labs(fill = "Satisfaction")
Taking into consideration the percentage of satisfied passengers in each group we note that this varies considerably. It may therefore be useful for the company to analyze what causes this difference.
# Mean of every variable within each cluster, to profile the groups
aggregate(df,by=list(df$cluster),FUN=mean)
Remove the cluster column:
# The cluster label was only needed for the profiling above
df <- df %>% select(-cluster)
# 80/20 train/test split (caret's createDataPartition samples within
# groups of the outcome, keeping the satisfaction balance in both sets)
split_train_test <- createDataPartition(df$sat, p=0.8, list=FALSE)
dtrain <- df[split_train_test,]
dtest <- df[-split_train_test,]
# Logistic regression of satisfaction on every remaining predictor.
# The summary object is stored as `logit_summary` — the original used
# `summary`, which shadows base::summary() for the rest of the session.
logistic_model <- glm(sat ~ ., data = dtrain, family = "binomial")
logit_summary <- summary(logistic_model)
round(logit_summary$coefficients, digits = 3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -12.125 0.112 -108.264 0.000
## gender 0.071 0.024 2.990 0.003
## cust_type 2.793 0.040 69.389 0.000
## age -0.002 0.001 -2.788 0.005
## class_Eco -0.788 0.033 -24.233 0.000
## class_Eco_plus -1.079 0.052 -20.931 0.000
## trav_type -3.361 0.041 -82.225 0.000
## distance 0.000 0.000 1.364 0.173
## wifi 0.773 0.015 53.180 0.000
## time_conv -0.337 0.014 -24.713 0.000
## onl_book 0.384 0.015 24.833 0.000
## gate_loc -0.256 0.013 -19.173 0.000
## food -0.054 0.013 -4.226 0.000
## onl_board 0.918 0.014 66.769 0.000
## comfort 0.036 0.014 2.609 0.009
## entert 0.079 0.018 4.482 0.000
## onb_serv 0.366 0.013 28.777 0.000
## leg_room 0.321 0.011 29.434 0.000
## baggage 0.166 0.014 11.643 0.000
## checkin 0.367 0.010 35.139 0.000
## infl_serv 0.169 0.015 11.315 0.000
## clean 0.265 0.014 18.612 0.000
## dep_delay -0.004 0.000 -13.628 0.000
Predict the points of the test set using the Logistic Model above implemented:
# Predicted probability of satisfaction for each test observation
logistic_prediction <- predict(logistic_model, dtest, type="response")
Confusion Matrix to evaluate the goodness:
# Classification threshold on the predicted probabilities
threshold <- 0.5
# Label an observation 1 ("satisfied") when its probability exceeds the threshold
logistic_prediction_binary <- as.numeric(logistic_prediction > threshold)
# Cross-tabulate predicted labels against the true labels
confusion_matrix <- table(logistic_prediction_binary, dtest$sat)
# Show the confusion table
print(confusion_matrix)
##
## logistic_prediction_binary 0 1
## 0 12511 1248
## 1 1186 8968
# Overall test-set accuracy of the logistic model
print(mean(logistic_prediction_binary==dtest$sat))
## [1] 0.8982144
We obtained a fraction of ~90% observation of the test set correctly predicted.
Plot the ROC Curve
# ROC curve (with AUC printed on the plot) for the logistic model
library(pROC)
test_roc = roc(dtest$sat ~ logistic_prediction, plot = TRUE, print.auc = TRUE)
Build the LDA model
# NOTE(review): loading MASS after dplyr masks dplyr::select(); use
# dplyr::select() explicitly if column selection is needed from here on.
library(MASS)
lda_model <- lda(sat~.,data = dtrain)
lda_model
## Call:
## lda(sat ~ ., data = dtrain)
##
## Prior probabilities of groups:
## 0 1
## 0.5733163 0.4266837
##
## Group means:
## gender cust_type age class_Eco class_Eco_plus trav_type distance
## 0 0.4872721 0.7696025 37.80018 0.6340810 0.09799416 0.49266958 960.8546
## 1 0.5026216 0.9310776 42.52342 0.1790562 0.03956976 0.06056745 1580.9135
## wifi time_conv onl_book gate_loc food onl_board comfort entert
## 0 2.414187 3.275128 2.622192 2.985212 2.953702 2.706692 3.030051 2.878683
## 1 3.363258 3.118023 3.223624 2.992674 3.564782 4.167002 4.030823 4.053487
## onb_serv leg_room baggage checkin infl_serv clean dep_delay
## 0 2.997392 2.998760 3.360485 3.024872 3.376276 2.920058 16.37097
## 1 3.905719 3.892806 4.006517 3.656294 4.008085 3.797961 12.64270
##
## Coefficients of linear discriminants:
## LD1
## gender 4.974355e-02
## cust_type 1.359423e+00
## age -2.201605e-03
## class_Eco -4.248463e-01
## class_Eco_plus -5.350976e-01
## trav_type -1.543498e+00
## distance 2.175575e-05
## wifi 3.460955e-01
## time_conv -9.898759e-02
## onl_book -4.537412e-02
## gate_loc -4.895115e-02
## food -2.987154e-02
## onl_board 3.824787e-01
## comfort 2.619206e-02
## entert 8.286813e-02
## onb_serv 1.421774e-01
## leg_room 1.447452e-01
## baggage 7.431393e-02
## checkin 1.409841e-01
## infl_serv 6.997590e-02
## clean 1.049909e-01
## dep_delay -1.490475e-03
Plot the coefficients of LDA Model.
# Distribution of the LD1 discriminant scores for each class
plot(lda_model)
Predict the points of the test set using the LDA Model above implemented:
lda_prediction = predict(lda_model, dtest)
# predict() on an lda model returns the predicted label in $class
lda_class_prediction = lda_prediction$class
Output the confusion matrix
table(lda_class_prediction, dtest$sat)
##
## lda_class_prediction 0 1
## 0 12388 1222
## 1 1309 8994
# Test-set accuracy of the LDA model
mean(lda_class_prediction==dtest$sat)
## [1] 0.894158
We see that we correctly predicted 89% of the labels of the test set.
ROC for LDA
lda_probabilities_prediction = lda_prediction$posterior[, "1"] # either write 0 or 1!
# given the output produced by predict() we only take the "posterior" column, which gives us the predicted probabilities [0,1] that we need to build the ROC.
# to understand this: if you print "predict(lda_model, dtest)$posterior" we can see that we have two columns, which are complementary. So, we just need one column of them.
# ROC curve (with AUC) for the LDA model
test_roc = roc(dtest$sat ~ lda_probabilities_prediction, plot = TRUE, print.auc = TRUE)
# Quadratic Discriminant Analysis on the same training set
qda_model <- qda(sat~.,data = dtrain)
qda_model
## Call:
## qda(sat ~ ., data = dtrain)
##
## Prior probabilities of groups:
## 0 1
## 0.5733163 0.4266837
##
## Group means:
## gender cust_type age class_Eco class_Eco_plus trav_type distance
## 0 0.4872721 0.7696025 37.80018 0.6340810 0.09799416 0.49266958 960.8546
## 1 0.5026216 0.9310776 42.52342 0.1790562 0.03956976 0.06056745 1580.9135
## wifi time_conv onl_book gate_loc food onl_board comfort entert
## 0 2.414187 3.275128 2.622192 2.985212 2.953702 2.706692 3.030051 2.878683
## 1 3.363258 3.118023 3.223624 2.992674 3.564782 4.167002 4.030823 4.053487
## onb_serv leg_room baggage checkin infl_serv clean dep_delay
## 0 2.997392 2.998760 3.360485 3.024872 3.376276 2.920058 16.37097
## 1 3.905719 3.892806 4.006517 3.656294 4.008085 3.797961 12.64270
Predict the points of the test set using the QDA Model above implemented:
qda_prediction = predict(qda_model, dtest)
# predict() on a qda model returns the predicted label in $class
qda_class_prediction = qda_prediction$class
Output the confusion matrix
table(qda_class_prediction, dtest$sat)
##
## qda_class_prediction 0 1
## 0 12240 1700
## 1 1457 8516
# Test-set accuracy of the QDA model
mean(qda_class_prediction==dtest$sat)
## [1] 0.8679798
Plot the ROC Curve
# Posterior probability of class "1", needed for the ROC curve
qda_probabilities_prediction <- qda_prediction$posterior[,"1"]
test_roc = roc(dtest$sat ~ qda_probabilities_prediction, plot = TRUE, print.auc = TRUE)
library(class)
# Split predictors and labels for knn(). The response column is selected
# by NAME — the original dropped column 23 by position, which silently
# breaks if columns are ever added, removed, or reordered upstream.
# create Data Domain X for the TRAINING set
train_X <- dtrain[, names(dtrain) != "sat"]
# create Data Domain X for the TEST set
test_X <- dtest[, names(dtest) != "sat"]
# labels of the training set
train_Y <- dtrain$sat
# labels of the test set
test_Y <- dtest$sat
library(ggplot2)
# Define the range of k values to evaluate
k_values <- c(1,3,5,7,9)
# Test-set accuracy for each k. vapply() preallocates a type-stable
# numeric result instead of growing a vector with c() inside a loop.
accuracies <- vapply(k_values, function(k) {
  knn_prediction <- knn(train_X, test_X, train_Y, k = k)
  mean(knn_prediction == test_Y)
}, numeric(1))
# Create a data frame with the k values and accuracies
acc_data <- data.frame(k = k_values, accuracy = accuracies)
# Plot accuracy against k
ggplot(acc_data, aes(x = k, y = accuracy)) +
  geom_line() +
  geom_point() +
  labs(x = "k", y = "Accuracy") +
  ggtitle(" Accuracy for KNN")
Error Rate plot
# Error rate is simply 1 - accuracy; subtraction is vectorized in R,
# so the original element-by-element loop (growing a vector with c())
# is replaced by a single expression.
k_values <- c(1,3,5,7,9)
error_rates <- 1 - acc_data$accuracy
# Create a data frame with the k values and error rates
err_data <- data.frame(k = k_values, error_rate = error_rates)
# Plot the error rate against k
ggplot(err_data, aes(x = k, y = error_rate)) +
  geom_line() +
  geom_point() +
  labs(x = "k", y = "Error Rate") +
  ggtitle("Error Rate Plot for KNN")
We choose k = 5.
In the following chunk we add prob = TRUE because we need the estimated probabilities to build the ROC curve later on.
# prob = TRUE attaches the winning-class vote proportion to the result,
# which is needed to build a ROC curve later
five_nn_prediction <- knn(train_X, test_X, train_Y, k = 5, prob=TRUE)
confusion_matrix_knn <- table(test_Y, five_nn_prediction)
confusion_matrix_knn
## five_nn_prediction
## test_Y 0 1
## 0 11189 2508
## 1 2846 7370
# Test-set accuracy of 5-NN
mean(five_nn_prediction==test_Y)
## [1] 0.776105
library(rpart)
# Convert the response variable to a factor (rpart classification needs it)
dtrain$sat <- as.factor(dtrain$sat)
dtest$sat <- as.factor(dtest$sat)
# Train the rpart model
tree_model <- rpart(sat ~ ., data = dtrain)
# Make predictions on the test set
predictions <- predict(tree_model, newdata = dtest, type = "class")
# Test-set accuracy expressed as a percentage
accuracy <- sum(predictions == dtest$sat) / nrow(dtest) * 100
# Print the accuracy
cat("Accuracy: ", accuracy, "%\n")
## Accuracy: 89.47434 %
# Visualize the fitted decision tree
library(rpart.plot)
rpart.plot(tree_model)
# Test-set error rate of every model fitted above
error_logreg <- mean(logistic_prediction_binary != test_Y)
error_lda <- mean(lda_class_prediction != test_Y)
error_qda <- mean(qda_class_prediction != test_Y)
error_5nn <- mean(five_nn_prediction != test_Y)
error_tree <- mean(predictions != test_Y)
# One row per method with its error rate
error_summary <- data.frame(
  Method = c("Logistic Regression", "LDA", "QDA", "5-NN", "TREE"),
  Error_Rate = c(error_logreg, error_lda, error_qda, error_5nn, error_tree)
)
# Compare all models in a single barplot
ggplot(error_summary, aes(x = Method, y = Error_Rate)) +
  geom_bar(stat = "identity", fill = "Red") +
  ylab("Test Error Rate") +
  ggtitle("Barplot of Test Error Rates") +
  theme_bw()